From: Keir Fraser <keir.fraser@citrix.com>
Date: Mon, 5 Jan 2009 10:45:48 +0000 (+0000)
Subject: PoD memory 7/9: Xen interface
X-Git-Tag: archive/raspbian/4.8.0-1+rpi1~1^2~14019^2~20
X-Git-Url: https://dgit.raspbian.org/%22http:/www.example.com/cgi/%22https:/%22bookmarks://%22Dat/%22http:/www.example.com/cgi/%22https:/%22bookmarks:/%22Dat?a=commitdiff_plain;h=36e30441326aa951ced7b1890d654d715db3bd3a;p=xen.git

PoD memory 7/9: Xen interface

Implement Xen interface to PoD functionality.
* Increase the number of MEMOP bits from 4 to 6 (increasing the number
of available memory operations from 16 to 64).
* Introduce XENMEMF_populate_on_demand, which will cause
populate_physmap() to fill a range with PoD entries rather than
backing it with ram
* Introduce XENMEM_[sg]et_pod_target operation to the memory
hypercall, to get and set PoD cache size.  set_pod_target() should be
called during domain creation, as well as after modifying the memory
target of any domain which may have outstanding PoD entries.

Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
---

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 2adc1ed2c7..efdde83524 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -3976,6 +3976,49 @@ long arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
         return 0;
     }
 
+    case XENMEM_set_pod_target:
+    case XENMEM_get_pod_target:
+    {
+        xen_pod_target_t target;
+        struct domain *d;
+
+        /* Support DOMID_SELF? */
+        if ( !IS_PRIV(current->domain) )
+            return -EINVAL;
+
+        if ( copy_from_guest(&target, arg, 1) )
+            return -EFAULT;
+
+        rc = rcu_lock_target_domain_by_id(target.domid, &d);
+        if ( rc != 0 )
+            return rc;
+
+        if ( op == XENMEM_set_pod_target )
+        {
+            if ( target.target_pages > d->max_pages )
+            {
+                rc = -EINVAL;
+                goto pod_target_out_unlock;
+            }
+            
+            rc = p2m_pod_set_mem_target(d, target.target_pages);
+        }
+
+        target.tot_pages       = d->tot_pages;
+        target.pod_cache_pages = d->arch.p2m->pod.count;
+        target.pod_entries     = d->arch.p2m->pod.entry_count;
+
+        if ( copy_to_guest(arg, &target, 1) )
+        {
+            rc= -EFAULT;
+            goto pod_target_out_unlock;
+        }
+        
+    pod_target_out_unlock:
+        rcu_unlock_domain(d);
+        return rc;
+    }
+
     default:
         return subarch_memory_op(op, arg);
     }
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index a8ed7ca7fc..b5868c7645 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -387,6 +387,150 @@ static struct page_info * p2m_pod_cache_get(struct domain *d,
     return p;
 }
 
+/* Set the size of the cache, allocating or freeing as necessary. */
+static int
+p2m_pod_set_cache_target(struct domain *d, unsigned long pod_target)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    int ret = 0;
+
+    /* Increasing the target */
+    while ( pod_target > p2md->pod.count )
+    {
+        struct page_info * page;
+        int order;
+
+        if ( (pod_target - p2md->pod.count) >= (1>>9) )
+            order = 9;
+        else
+            order = 0;
+
+        page = alloc_domheap_pages(d, order, 0);
+        if ( unlikely(page == NULL) )
+            goto out;
+
+        p2m_pod_cache_add(d, page, order);
+    }
+
+    /* Decreasing the target */
+    /* We hold the p2m lock here, so we don't need to worry about
+     * cache disappearing under our feet. */
+    while ( pod_target < p2md->pod.count )
+    {
+        struct page_info * page;
+        int order, i;
+
+        /* Grab the lock before checking that pod.super is empty, or the last
+         * entries may disappear before we grab the lock. */
+        spin_lock(&d->page_alloc_lock);
+
+        if ( (p2md->pod.count - pod_target) > (1>>9)
+             && !list_empty(&p2md->pod.super) )
+            order = 9;
+        else
+            order = 0;
+
+        page = p2m_pod_cache_get(d, order);
+
+        ASSERT(page != NULL);
+
+        spin_unlock(&d->page_alloc_lock);
+
+        /* Then free them */
+        for ( i = 0 ; i < (1 << order) ; i++ )
+        {
+            /* Copied from common/memory.c:guest_remove_page() */
+            if ( unlikely(!get_page(page+i, d)) )
+            {
+                gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
+                ret = -EINVAL;
+                goto out;
+            }
+
+            if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
+                put_page_and_type(page+i);
+            
+            if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
+                put_page(page+i);
+
+            put_page(page+i);
+        }
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * The "right behavior" here requires some careful thought.  First, some
+ * definitions:
+ * + M: static_max
+ * + B: number of pages the balloon driver has ballooned down to.
+ * + P: Number of populated pages. 
+ * + T: Old target
+ * + T': New target
+ *
+ * The following equations should hold:
+ *  0 <= P <= T <= B <= M
+ *  d->arch.p2m->pod.entry_count == B - P
+ *  d->tot_pages == P + d->arch.p2m->pod.count
+ *
+ * Now we have the following potential cases to cover:
+ *     B <T': Set the PoD cache size equal to the number of outstanding PoD
+ *   entries.  The balloon driver will deflate the balloon to give back
+ *   the remainder of the ram to the guest OS.
+ *  T <T'<B : Increase PoD cache size.
+ *  T'<T<=B : Here we have a choice.  We can decrease the size of the cache,
+ *   get the memory right away.  However, that means every time we 
+ *   reduce the memory target we risk the guest attempting to populate the 
+ *   memory before the balloon driver has reached its new target.  Safer to
+ *   never reduce the cache size here, but only when the balloon driver frees 
+ *   PoD ranges.
+ *
+ * If there are many zero pages, we could reach the target also by doing
+ * zero sweeps and marking the ranges PoD; but the balloon driver will have
+ * to free this memory eventually anyway, so we don't actually gain that much
+ * by doing so.
+ *
+ * NB that the equation (B<T') may require adjustment to the cache
+ * size as PoD pages are freed as well; i.e., freeing a PoD-backed
+ * entry when pod.entry_count == pod.count requires us to reduce both
+ * pod.entry_count and pod.count.
+ */
+int
+p2m_pod_set_mem_target(struct domain *d, unsigned long target)
+{
+    unsigned pod_target;
+    struct p2m_domain *p2md = d->arch.p2m;
+    int ret = 0;
+    unsigned long populated;
+
+    /* P == B: Nothing to do. */
+    if ( p2md->pod.entry_count == 0 )
+        goto out;
+
+    /* T' < B: Don't reduce the cache size; let the balloon driver
+     * take care of it. */
+    if ( target < d->tot_pages )
+        goto out;
+
+    populated  = d->tot_pages - p2md->pod.count;
+
+    pod_target = target - populated;
+
+    /* B < T': Set the cache size equal to # of outstanding entries,
+     * let the balloon driver fill in the rest. */
+    if ( pod_target > p2md->pod.entry_count )
+        pod_target = p2md->pod.entry_count;
+
+    ASSERT( pod_target > p2md->pod.count );
+
+    ret = p2m_pod_set_cache_target(d, pod_target);
+
+out:
+    return ret;
+}
+
 void
 p2m_pod_empty_cache(struct domain *d)
 {
@@ -538,6 +682,13 @@ p2m_pod_decrease_reservation(struct domain *d,
         }
     }    
 
+    /* If we've reduced our "liabilities" beyond our "assets", free some */
+    if ( p2md->pod.entry_count < p2md->pod.count )
+    {
+        printk("b %d\n", p2md->pod.entry_count);
+        p2m_pod_set_cache_target(d, p2md->pod.entry_count);
+    }
+
     /* If there are no more non-PoD entries, tell decrease_reservation() that
      * there's nothing left to do. */
     if ( nonpod == 0 )
@@ -786,7 +937,7 @@ p2m_pod_emergency_sweep_super(struct domain *d)
         /* Stop if we're past our limit and we have found *something*.
          *
          * NB that this is a zero-sum game; we're increasing our cache size
-         * by re-increasing our 'debt'.  Since we hold the p2m lock,
+         * by increasing our 'debt'.  Since we hold the p2m lock,
          * (entry_count - count) must remain the same. */
         if ( !list_empty(&p2md->pod.super) &&  i < limit )
             break;
diff --git a/xen/arch/x86/x86_64/compat/mm.c b/xen/arch/x86/x86_64/compat/mm.c
index 942c35a1e2..e14f6fab31 100644
--- a/xen/arch/x86/x86_64/compat/mm.c
+++ b/xen/arch/x86/x86_64/compat/mm.c
@@ -128,6 +128,29 @@ int compat_arch_memory_op(int op, XEN_GUEST_HANDLE(void) arg)
         break;
     }
 
+    case XENMEM_set_pod_target:
+    case XENMEM_get_pod_target:
+    {
+        struct compat_pod_target cmp;
+        struct xen_pod_target *nat = (void *)COMPAT_ARG_XLAT_VIRT_BASE;
+
+        if ( copy_from_guest(&cmp, arg, 1) )
+            return -EFAULT;
+
+        XLAT_pod_target(nat, &cmp);
+
+        rc = arch_memory_op(op, guest_handle_from_ptr(nat, void));
+        if ( rc < 0 )
+            break;
+
+        XLAT_pod_target(&cmp, nat);
+
+        if ( copy_to_guest(arg, &cmp, 1) )
+            rc = -EFAULT;
+
+        break;
+    }
+
     case XENMEM_machphys_mapping:
     {
         struct domain *d = current->domain;
diff --git a/xen/common/memory.c b/xen/common/memory.c
index 0d92808bd2..bf10bbbfd2 100644
--- a/xen/common/memory.c
+++ b/xen/common/memory.c
@@ -111,31 +111,40 @@ static void populate_physmap(struct memop_args *a)
         if ( unlikely(__copy_from_guest_offset(&gpfn, a->extent_list, i, 1)) )
             goto out;
 
-        page = alloc_domheap_pages(d, a->extent_order, a->memflags);
-        if ( unlikely(page == NULL) ) 
+        if ( a->memflags & MEMF_populate_on_demand )
         {
-            gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
-                     "id=%d memflags=%x (%ld of %d)\n",
-                     a->extent_order, d->domain_id, a->memflags,
-                     i, a->nr_extents);
-            goto out;
+            if ( guest_physmap_mark_populate_on_demand(d, gpfn,
+                                                       a->extent_order) < 0 )
+                goto out;
         }
+        else
+        {
+            page = alloc_domheap_pages(d, a->extent_order, a->memflags);
+            if ( unlikely(page == NULL) ) 
+            {
+                gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
+                         "id=%d memflags=%x (%ld of %d)\n",
+                         a->extent_order, d->domain_id, a->memflags,
+                         i, a->nr_extents);
+                goto out;
+            }
 
-        mfn = page_to_mfn(page);
-        guest_physmap_add_page(d, gpfn, mfn, a->extent_order);
+            mfn = page_to_mfn(page);
+            guest_physmap_add_page(d, gpfn, mfn, a->extent_order);
 
-        if ( !paging_mode_translate(d) )
-        {
-            for ( j = 0; j < (1 << a->extent_order); j++ )
-                set_gpfn_from_mfn(mfn + j, gpfn + j);
+            if ( !paging_mode_translate(d) )
+            {
+                for ( j = 0; j < (1 << a->extent_order); j++ )
+                    set_gpfn_from_mfn(mfn + j, gpfn + j);
 
-            /* Inform the domain of the new page's machine address. */ 
-            if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
-                goto out;
+                /* Inform the domain of the new page's machine address. */ 
+                if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
+                    goto out;
+            }
         }
     }
 
- out:
+out:
     a->nr_done = i;
 }
 
@@ -527,6 +536,10 @@ long do_memory_op(unsigned long cmd, XEN_GUEST_HANDLE(void) arg)
 
         args.memflags |= MEMF_node(XENMEMF_get_node(reservation.mem_flags));
 
+        if ( op == XENMEM_populate_physmap
+             && (reservation.mem_flags & XENMEMF_populate_on_demand) )
+            args.memflags |= MEMF_populate_on_demand;
+
         if ( likely(reservation.domid == DOMID_SELF) )
         {
             d = rcu_lock_current_domain();
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index 2d4fd382f0..d2778a92c0 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -261,6 +261,10 @@ void p2m_pod_dump_data(struct domain *d);
  * (usually in preparation for domain destruction) */
 void p2m_pod_empty_cache(struct domain *d);
 
+/* Set populate-on-demand cache size so that the total memory allocated to a
+ * domain matches target */
+int p2m_pod_set_mem_target(struct domain *d, unsigned long target);
+
 /* Call when decreasing memory reservation to handle PoD entries properly.
  * Will return '1' if all entries were handled and nothing more need be done.*/
 int
diff --git a/xen/include/public/memory.h b/xen/include/public/memory.h
index d7b9fff972..6edd59b5ef 100644
--- a/xen/include/public/memory.h
+++ b/xen/include/public/memory.h
@@ -48,6 +48,8 @@
 /* NUMA node to allocate from. */
 #define XENMEMF_node(x)     (((x) + 1) << 8)
 #define XENMEMF_get_node(x) ((((x) >> 8) - 1) & 0xffu)
+/* Flag to populate physmap with populate-on-demand entries */
+#define XENMEMF_populate_on_demand (1<<16)
 #endif
 
 struct xen_memory_reservation {
@@ -299,6 +301,19 @@ struct xen_foreign_memory_map {
 typedef struct xen_foreign_memory_map xen_foreign_memory_map_t;
 DEFINE_XEN_GUEST_HANDLE(xen_foreign_memory_map_t);
 
+#define XENMEM_set_pod_target       16
+#define XENMEM_get_pod_target       17
+struct xen_pod_target {
+    /* IN */
+    uint64_t target_pages;
+    /* OUT */
+    uint64_t tot_pages;
+    uint64_t pod_cache_pages;
+    uint64_t pod_entries;
+    /* IN */
+    domid_t domid;
+};
+typedef struct xen_pod_target xen_pod_target_t;
 #endif /* __XEN_PUBLIC_MEMORY_H__ */
 
 /*
diff --git a/xen/include/xen/hypercall.h b/xen/include/xen/hypercall.h
index 3997b2f96a..99d2e0008a 100644
--- a/xen/include/xen/hypercall.h
+++ b/xen/include/xen/hypercall.h
@@ -48,7 +48,7 @@ do_platform_op(
  * at what point in the page list to resume. For this purpose I steal the
  * high-order bits of the @cmd parameter, which are otherwise unused and zero.
  */
-#define MEMOP_EXTENT_SHIFT 4 /* cmd[:4] == start_extent */
+#define MEMOP_EXTENT_SHIFT 6 /* cmd[:6] == start_extent */
 #define MEMOP_CMD_MASK     ((1 << MEMOP_EXTENT_SHIFT) - 1)
 
 extern long
diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
index 08bd72d8ce..0b78e8647e 100644
--- a/xen/include/xen/mm.h
+++ b/xen/include/xen/mm.h
@@ -72,6 +72,8 @@ int assign_pages(
 /* memflags: */
 #define _MEMF_no_refcount 0
 #define  MEMF_no_refcount (1U<<_MEMF_no_refcount)
+#define _MEMF_populate_on_demand 1
+#define  MEMF_populate_on_demand (1U<<_MEMF_populate_on_demand)
 #define _MEMF_node        8
 #define  MEMF_node(n)     ((((n)+1)&0xff)<<_MEMF_node)
 #define _MEMF_bits        24
diff --git a/xen/include/xlat.lst b/xen/include/xlat.lst
index b10d215254..0bfa78f6c6 100644
--- a/xen/include/xlat.lst
+++ b/xen/include/xlat.lst
@@ -38,6 +38,7 @@
 !	memory_exchange			memory.h
 !	memory_map			memory.h
 !	memory_reservation		memory.h
+!	pod_target			memory.h
 !	translate_gpfn_list		memory.h
 !	sched_poll			sched.h
 ?	sched_remote_shutdown		sched.h